******************************************************************************************************************
* This contruction file merges all the raw data files shared with us by Hired.com
* to construct the dataset for final analysis

*********************************************************************************************************************
* Input (raw) csv data:
*compensation
*matched_education
*most_recent_company
*company_id_benefits
*platform_companies_info
*jobtitles
*top five skills
*candidate_wishlist_data
*candidate_offers

* Output:
* SGenderMetaD

* Disclaimer: the code below is not optimized for minimal length / run time
* Rather it aims at maximizing clarity of what data cleaning and variable construction
* is performed, as well as minimizing the chances of coding errors. 

***********************************
**  I/ Load and clean the  data  **
***********************************

*------------------------------------------------------------------------------*
**	 1. Import and clean the equity file  **
import delimited using "$indata/compensation.csv", clear

	* Restrict to jobs we can match 
	drop if mi(jobid)

	* Align nomenclature
	ren candidateid sid
	ren equity equityval
	
	* Generate the equity variable
	gen equity = 1 if equityval == "yes"
	replace equity = 0 if equityval == "no"
	drop equityval
	keep equity jobid sid
	duplicates drop

	* save compensation file
	tempfile compensation
	save `compensation'
	
*------------------------------------------------------------------------------*


*------------------------------------------------------------------------------*
**	2. Import and clean the education file **

import delimited using "$indata/matched_education.csv" , clear

	* Define graduation year as graduation year of highest degree
	bys sid: replace graduation_year=. if normalized_degree != highest_degree
	ren graduation_year s_grad_year
	
	* Date cleaning
	replace s_grad_year = 2017 if s_grad_year==20017
	replace s_grad_year = 2013 if s_grad_year==13
	replace s_grad_year = 1996 if s_grad_year==96
	replace s_grad_year = 1979 if s_grad_year==79
	replace s_grad_year = . if !inrange(s_grad_year,1960,2025)

	* Destring the highest degree variable
	tab highest_degree, mi
	gen cat_degree = 0 if highest_degree == "highschool"
	replace cat_degree = 1 if highest_degree == "associate"
	replace cat_degree = 2 if highest_degree == "bachelor"
	replace cat_degree = 3 if highest_degree == "master"
	replace cat_degree = 4 if highest_degree == "mba"
	replace cat_degree = 5 if highest_degree == "phd"
	labmask cat_degree, values(highest_degree)
	drop highest_degree


	* Create the a dummy for Ivy League + 
	gen ivy = 0
	quietly foreach uni in  "brown university" "columbia university" ///
	 "cornell university" "dartmouth college" "harvard university" ///
	 "princeton university" "university of pennsylvania" ///
	 "massachusetts institute of technology" ///
	 "yale university" "university of chicago" "university of california berkeley" ///
	 "carnegie mellon university" "duke university" ///
	 "california institute of technology caltech" "georgia tech" { 
		   replace ivy = 1 if strpos(university, `"`uni'"') 
	} 
	* Define having attended Ivy League + education at the sid level
	* Note: the max of a missing and the number is the number, which is what we want
	bys sid: egen ivyplus = max(ivy)
	label define dummy 1 "yes" 0 "no" 
	label values ivyplus dummy 
	tab ivyplus
	

	* Create dummy for whether the person has a degree in cs
	* Define study_field as field of study of highest degree
	bys sid: replace normalized_field="" if normalized_degree != highest_degree
	ren normalized_field field_study
	gen cs = 0
	replace cs = 1 if field_study == 1
	* Define having a CS degree at the sid level
	bys sid: egen csdegree = max(cs)
	tab csdegree

	* Bin the university ranking into categorical variable
	bys sid: egen avg_educ_qual = mean(world_rank)
	gen ranking = 1 if inrange(avg_educ_qual, 1, 20)
	replace ranking = 2 if inrange(avg_educ_qual, 20.1, 100)
	replace ranking = 3 if inrange(avg_educ_qual, 100.1, 500)
	replace ranking = 4 if inrange(avg_educ_qual, 500.1, 1000)
	replace ranking = 5 if inrange(avg_educ_qual, 1000.1, 5000)
	replace ranking = 6 if avg_educ_qual>5000 & !mi(avg_educ_qual)
	replace ranking = 100 if mi(avg_educ_qual)
	label define lrank 100 "missing" 1 "1-20" 2 "21-100" 3 "101-500" 4 "501-1000" 5 "1001-5000" 6 "5000+"
	label values ranking lrank

	* drop the duplicates
	keep sid cat_degree ivyplus csdegree s_grad_year avg_educ_qual ranking 
	drop if mi(s_grad_year)
	duplicates drop
	duplicates tag sid , generate(tag)
	drop if tag >= 1 & mi(cat_degree)
	drop tag

	* Collapse to sid level of highest/most recent education
	sort sid s_grad_year
	collapse (last) cat_degree ivyplus csdegree s_grad_year  ///
					avg_educ_qual ranking, by(sid)

	* save education file
	tempfile education
	save `education'
*------------------------------------------------------------------------------*



*------------------------------------------------------------------------------*
** 3. Import and clean the candidate companies file **

import delimited using "$indata/most_recent_company.csv", clear

	rename candidate_id sid
	drop company_name
	ren name_merge company_name

	* Generate FAANG dummy: if person has worked at one of Facebook, Apple, Amazon, Netflix, Google
	gen google = (strpos(company_name,"google") >0)
	gen facebook = (strpos(company_name,"facebook") >0)
	gen amazon = (strpos(company_name, "amazon") >0)
	gen apple = (strpos(company_name, "apple") >0)
	gen netflix =(strpos( company_name, "netflix") >0)

	gen faang = 1 if google | facebook | amazon | netflix
	replace faang = 0 if mi(faang)
	tab faang
	drop  google facebook amazon netflix apple

	* Define Faang at the sid level
	bysort sid: egen faang_max = max(faang)
	drop faang 
	ren faang_max faang
	label values faang dummy 
	tab faang

	* Now we can focus solely on the most recent companies
	keep if recent_company == 1


	* Select variables
	keep sid faang company_name 

	duplicates drop
	drop if mi(sid)

	* Save the company file
	tempfile company
	save `company'
	
*------------------------------------------------------------------------------*



*------------------------------------------------------------------------------*
** 4. Import and clean the Hired companies' benefits files **

* benefits
import delimited using "$indata/company_id_benefits.csv", clear

	* For some companies we have observations with benefits from other cities, 
	* I keep those observations with the most listed benefits
	duplicates tag companyid, gen(tag)
	bys companyid: egen maxentry = max(numberofbenefits) if tag>0
	drop if numberofbenefits != maxentry & !mi(maxentry)
	drop tag maxentry

	replace numberofbenefits=0 if mi(numberofbenefits)

	* Create dummies for offered types of benefits
	gen performance_bonus = (strpos(benefits, "Performance bonus")>0) 
	gen stocks = (strpos(benefits, "Stock")>0)
	gen pension_plans = (strpos(benefits, "401k")>0) | ///
						(strpos(benefits, "pension")>0) | ///
						(strpos(benefits, "FSA")>0)
	gen cummute_friendly = (strpos(benefits, "Commuter")>0) | ///
						   (strpos(benefits, "parking")>0) | ///
						   (strpos(benefits, "transportation")>0) 
	gen health_ins = (strpos(benefits, "Health insurance")>0)
	gen dental_ins = (strpos(benefits, "Dental insurance")>0)
	gen vision_ins = (strpos(benefits, "Vision insurance")>0)
	gen life_ins = (strpos(benefits, "Life insurance")>0)
	gen disability_ins = (strpos(benefits, "Disability insurance")>0)
	gen emply_assist =  (strpos(benefits, "Employee Assistance Program")>0) | ///
						(strpos(benefits, "legal")>0)
	gen fitness = (strpos(benefits, "fitness")>0) | ///
				  (strpos(benefits, "Fitness")>0) | ///
				  (strpos(benefits, "Showers")>0)
	gen family_benefits = (strpos(benefits, "Maternity")>0) | ///
						  (strpos(benefits, "Paternity")>0) | ///
						  (strpos(benefits, "Fertility")>0) | ///
						  (strpos(benefits, "Adoption")>0) |  ///
						  (strpos(benefits, "Child")>0)
	gen tuition_reimb = (strpos(benefits, "Tuition")>0)
	gen flexibility = (strpos(benefits, "time off")>0) | ///
					  (strpos(benefits, "Sabbatical")>0) | ///
					  (strpos(benefits, "working hours")>0) | ///
					  (strpos(benefits, "holidays")>0) | ///
					  (strpos(benefits, "Work from home")>0) 
	gen training_on_job = (strpos(benefits, "training")>0) | ///
						  (strpos(benefits, "Conferences")>0) | ///
						  (strpos(benefits, "Diversity program")>0) | ///
						  (strpos(benefits, "Hackathons")>0) | ///
						  (strpos(benefits, "Internship program")>0)
	gen free_food = (strpos(benefits, "catered")>0) | ///
					(strpos(benefits, "Snacks")>0)
	gen charity = (strpos(benefits, "charitable")>0) | ///
				  (strpos(benefits, "Dog walking")>0)
	gen work_env = (strpos(benefits, "activities")>0) | ///
				   (strpos(benefits, "recreation")>0) | ///
				   (strpos(benefits, "Employee groups")>0)
	gen pet_friendly = (strpos(benefits, "Dog")>0) | ///
					   (strpos(benefits, "Pet")>0)
	gen discount =  (strpos(benefits, "discount")>0) 

	* save the benefits file
	tempfile benefits
	save `benefits'

		
*------------------------------------------------------------------------------*



*------------------------------------------------------------------------------*
** 5. Import and clean the Hired companies' characteristics files **

* companies' characteristics
import delimited using "$indata/platform_companies_info.csv", clear		

	* Select variables
	keep companyid companysize revenue industry founded  ///
		 numberofbenefits benefits performance_bonus stocks ///
		 pension_plans  health_ins dental_ins vision_ins life_ins  ///
		 disability_ins emply_assist fitness family_benefits tuition_reimb ///
		 flexibility training_on_job free_food charity  work_env pet_friendly ///
		 discount cummute_friendly
	duplicates drop

	* Clean Platform Company Size Variable
	ren companysize company_size		
	replace company_size = "8" if company_size == "1-15" | company_size == "1-15 employees"  ///
								| company_size == "less than 10"
	replace company_size = "32" if (strpos(company_size, "16")>0) | company_size == "11 to 50"
	replace company_size = "125" if (strpos(company_size, "51")>0)
	replace company_size = "350" if (strpos(company_size, "201")>0)
	replace company_size = "750" if company_size == "501-1000" | company_size == "501-1000 employees" 
	replace company_size = "1250" if (strpos(company_size, "1001")>0)
	replace company_size = "3250" if (strpos(company_size, "5000")>0)
	replace company_size = "10000" if (strpos(company_size, "5001")>0) | company_size == "more than 10,000"
	destring company_size, replace

	* Destring foundation year
	destring founded, replace

	* Pool Software Related Industries
	replace industry = "Enterprise Software" if industry == "Enterprise Softwar"
	replace industry = "Enterprise Software" if  industry == "Technology Infrastructure"
	replace industry = "Information Technology" if industry == "Information Tech" 
	replace industry = "Insurance" if industry == "Insuranc"
	replace industry = "News" if industry == "New"
	replace industry = "Human Resources & Careers" if industry == "Staffing & Subcontracting"
	replace industry = "Insurance" if industry == "Finance/Insurance"
	replace industry = "Digital Communication" if industry == "Telecommunications"
	replace industry = "Information Technology" if industry == "Information Technology Support Services"
	replace industry = "Real Estate & Property Management" if industry == "Real Estate"
	replace industry = "Electric Energy & Natural Gas" if industry == "Oil & Gas" | industry == "Energy"
	replace industry = "Biotechnology & Chemical Products" if industry== "Biotechnology"
  
	encode industry, gen(industry_drop)
	drop industry
	ren industry_drop industry	
		
	* rename variables as information about companies that use Hired.com
	foreach var in revenue company_size founded industry ///
				numberofbenefits benefits performance_bonus stocks ///
				pension_plans  health_ins dental_ins vision_ins life_ins  ///
				disability_ins emply_assist fitness family_benefits ///
				tuition_reimb flexibility training_on_job free_food charity ///
				work_env pet_friendly discount cummute_friendly{
					rename `var' platform_`var'
				}
			
	* Save the platform company info file		
	tempfile platform
	save `platform'			

*------------------------------------------------------------------------------*




*------------------------------------------------------------------------------*
** 6. Import and clean the jobtitles file **
import delimited using "$indata/jobtitles.csv", clear
drop v1

	* Define job positions from strings
	replace position_name_norm = "cto " if position_name_norm == "cto"
	
	gen junior = (strpos(position_name_norm, "junior")>0)
	gen senior = (strpos(position_name_norm, "senior")>0) | ///
				 (strpos(position_name_norm, "ii")>0) | ///
				 (strpos(position_name_norm, "2")>0)
	gen manager = (strpos(position_name_norm, "manager")>0) 
	gen lead =  (strpos(position_name_norm, "lead")>0) | ///
				(strpos(position_name_norm, "principle")>0) | ///
				(strpos(position_name_norm, "owner")>0)
	gen head = (strpos(position_name_norm, "head")>0) 
	gen director = (strpos(position_name_norm, "direct")>0) 
	gen vp = (strpos(position_name_norm, "vp")>0) | ///
			 (strpos(position_name_norm, "president")>0) | ///
			 (strpos(position_name_norm, "executive director")>0) | ///
			 (strpos(position_name_norm, "managing director")>0)
	gen founder = (strpos(position_name_norm, "founder")>0)  |  ///
				  (strpos(position_name_norm, "chief")>0)  | ///
				  (strpos(position_name_norm, "ceo")>0)  |  ///
				  (strpos(position_name_norm, "cto ")>0)  | ///
				  (strpos(position_name_norm, "officer")>0) | ///
				  (strpos(position_name_norm, "cmo")>0) | ///
				  (strpos(position_name_norm, "cpo")>0) | ///
				  (strpos(position_name_norm, "cro")>0)
	gen engineer =  (strpos(position_name_norm, "engineer")>0) | ///
					(strpos(position_name_norm, "developer")>0) | ///
					(strpos(position_name_norm, "archite")>0)  | ///
					(strpos(position_name_norm, "program")>0) | ///
					(strpos(position_name_norm, "develop")>0)
	gen consultant = (strpos(position_name_norm, "consultant")>0) 
	gen designer = (strpos(position_name_norm, "design")>0) 


	* Define highest job title on sid level
	collapse (max) junior senior manager lead head director vp founder ///
				   engineer consultant designer (first) linkedin website, by(sid)

	gen highest_jobtitle = 0 if junior == 1
	replace highest_jobtitle = 1 if senior == 1 
	replace highest_jobtitle = 2 if manager == 1 
	replace highest_jobtitle = 3 if lead == 1 
	replace highest_jobtitle = 4 if head == 1 
	replace highest_jobtitle = 5 if director == 1 
	replace highest_jobtitle = 6 if vp == 1 
	replace highest_jobtitle = 7 if engineer == 1 & mi(highest_jobtitle)
	replace highest_jobtitle = 8 if consultant == 1 & mi(highest_jobtitle)
	replace highest_jobtitle = 9 if designer == 1 & mi(highest_jobtitle)
	replace highest_jobtitle = 10 if founder == 1 & mi(highest_jobtitle)
	replace highest_jobtitle = 11 if mi(highest_jobtitle)

	label define highest_jobtitles 0 "junior" 1 "senior" 2 "manager" 3 "lead" ///
								   4 "head" 5 "director" 6 "vp" 7 "eng" ///
								   8 "consultant" 9 "designer" 10 "founder" 11 "other"
	label values highest_jobtitle highest_jobtitles 

	* Create dummies whether person has linkedin or website link on profile
	gen linkedin2 = (linkedin=="t")
	gen website2 = (website=="t")
	drop linkedin  website
	ren linkedin2 linkedin
	ren website2 website
	
	* Restrict and save jobtitle file
	keep sid founder highest_jobtitle linkedin website

	tempfile jobtitle
	save `jobtitle'
*------------------------------------------------------------------------------*




*------------------------------------------------------------------------------*
** 7. Import the skills file **
import delimited  using "$indata/top five skills.csv", clear rowr(2:)

	* rename variables
	ren v1 sid
	ren v2 skill

	* format variables
	destring  sid , replace force
	duplicates drop sid, force

	* remove empty columns
	drop v3 v4 v5

	* data pre-processing
	replace skill = lower(skill)
	replace skill = subinstr(skill," ","",.)
	replace skill = "," + skill + "," 

	* separate each skill so we can get a count of the most frequent ones
	split skill, parse(",")

	* skill1 is empty and skill7-skill18 are essentially empty as well
	drop skill1 skill7-skill18

	* 50 most frequent skills : represent 95% all skills
	groups skill2, order(h) select(50)

	* Now use these top 50 skills to create skill groups dummies by
	* grouping the ones that are similar
	foreach j in "" {

	gen blockchain`j' = (strpos(skill`j', "blockchain") > 0) 
	gen java`j' = (strpos(skill`j', "java,") > 0) 
	gen excel`j' =  (strpos(skill`j', "microsoftoffice") > 0) | ///
					(strpos(skill`j', "msoffice") > 0)  | ///
					(strpos(skill`j', "excel") > 0) 
	gen leadership`j' = (strpos(skill`j', "leadership") > 0) 
	gen projectmanagement`j' =  (strpos(skill`j', "projectmanagement") > 0) | ///
								(strpos(skill`j', "accountmanagement") > 0)
	gen html`j' = (strpos(skill`j', "html") > 0)
	gen python`j' = (strpos(skill`j', "python") > 0) 
	gen javascript`j' = (strpos(skill`j', "javascript") > 0) | ///
						(strpos(skill`j', "jquery") > 0)
	gen adobe`j' =  (strpos(skill`j', "adobe") > 0) | ///
					(strpos(skill`j', "photoshop") > 0) | ///
					(strpos(skill`j', "illustrator") > 0)
	gen product`j' = (strpos(skill`j', "product") > 0)
	gen businessdevelopment`j' = (strpos(skill`j', "business") > 0)
	gen sql`j' = (strpos(skill`j', "sql") > 0) 
	gen socialmedia`j' = (strpos(skill`j', "socialmedia") > 0)
	gen enterprise`j' = (strpos(skill`j', "enterprise") > 0)
	gen dataanalysis`j' = (strpos(skill`j', "dataanalysis") > 0) | ///
						  (strpos(skill`j', "dataanalytics") > 0) | ///
						  (strpos(skill`j', "datamanagement") > 0)
	gen pointnet`j' = (strpos(skill`j', ".net") > 0)
	gen c`j' =  (strpos(skill`j', ",c,") > 0) | ///
				(strpos(skill`j', "c/c++") > 0) | ///
				(strpos(skill`j', "c++") > 0) | ///
				(strpos(skill`j', "-c") > 0)
	gen customerservice`j' = (strpos(skill`j', "customerservice") > 0)
	gen php`j' = (strpos(skill`j', "php") > 0) 
	gen ios`j' = (strpos(skill`j', ",ios,") > 0)
	gen android`j' = (strpos(skill`j', "android") > 0)
	gen chash`j' = (strpos(skill`j', "c#") > 0)
	gen marketing`j' =  (strpos(skill`j', "marketing") > 0) | ///
						(strpos(skill`j', "leadgeneration") > 0) 
	gen r`j' = (strpos(skill`j', ",r,") > 0)
	gen saas`j' = (strpos(skill`j', "saas") > 0)
	gen design`j' = (strpos(skill`j', "design") > 0)
	gen ruby`j' = (strpos(skill`j', "ruby") > 0) 
	gen nodejs`j' = (strpos(skill`j', "node.js") > 0) | ///
					(strpos(skill`j', "nodejs") > 0)
	gen machinelearning`j' = (strpos(skill`j', "machinelearning") > 0)
	gen linux`j' = (strpos(skill`j', "linux") > 0) 
	gen sales`j' =  (strpos(skill`j', ",sales,") > 0) | ///
					(strpos(skill`j', "salesoperations") > 0) | ///
					(strpos(skill`j', "salesmanagement") > 0)  
	gen agile`j' = (strpos(skill`j', "agile") > 0) 
	gen css`j' = (strpos(skill`j', "css") > 0)
	gen react`j' = (strpos(skill`j', "react") > 0)
	gen midmarket`j' = (strpos(skill`j', "mid-market") > 0)
	gen testing`j' = (strpos(skill`j', "testing") > 0) 
	gen sketch`j' = (strpos(skill`j', "sketch") > 0)
	gen strategy`j' = (strpos(skill`j', "strateg") > 0) 
	gen angular`j' = (strpos(skill`j', "angular") > 0) 
	gen smb`j' = (strpos(skill`j', "smb") > 0)
	gen autocad`j' = (strpos(skill`j', "autocad") > 0)
	gen swift`j' = (strpos(skill`j', "swift") > 0)
	gen communication`j' = (strpos(skill`j', "communication") > 0)
	gen leadgeneration`j' = (strpos(skill`j', "leadgeneration") > 0)
	gen businessanalysis`j' = (strpos(skill`j', "businessanalysis") > 0)
	gen testautomation`j' = (strpos(skill`j', "testautomation") > 0)
	gen webdesign`j' = (strpos(skill`j', "webdesign") > 0)
	gen recruiting`j' = (strpos(skill`j', "recruiting") > 0)
	gen digitalstrategy`j' = (strpos(skill`j', "digitalstrategy") > 0)
	gen go`j' = (strpos(skill`j', ",go,") > 0)
	gen tableau`j' = (strpos(skill`j', "tableau") > 0)
	gen matlab`j' = (strpos(skill`j', "matlab") > 0) 
	gen uidesign`j' =   (strpos(skill`j', "uidesign") > 0) | ///
						(strpos(skill`j', "ui/uxdesign") > 0) | ///
						(strpos(skill`j', ",ui,") > 0)  | ///
						(strpos(skill`j', "uxdesign") > 0) 
	gen amazon`j' = (strpos(skill`j', "amazonwebservice") > 0) | ///
					(strpos(skill`j', "aws") > 0)
	gen salesforce`j' =  (strpos(skill`j', "salesforce") > 0)
	gen hadoop`j' = (strpos(skill`j', "hadoop") > 0)
	gen asp`j' = (strpos(skill`j', "asp.net") > 0) | ///
				 (strpos(skill`j', "asp.netmvc") > 0) | ///
				 (strpos(skill`j', ",asp,") > 0) 
	gen sas`j' = (strpos(skill`j', ",sas,") > 0) | ///
				 (strpos(skill`j', "sasprogramming") > 0)
	gen devops`j' = (strpos(skill`j', "devops") > 0)
	gen perl`j' = (strpos(skill`j', ",perl,") > 0)
	gen vba`j' = (strpos(skill`j', ",vba,") > 0)
	gen scala`j' = (strpos(skill`j', "scala") > 0)
	gen financialanalysis`j' = (strpos(skill`j', "financialanalysis") > 0) 
	gen research`j' = (strpos(skill`j', ",research,") > 0) 
	gen statistics`j' = (strpos(skill`j', "statistics") > 0) 
	gen visualisation`j' =  (strpos(skill`j', "visualisation ") > 0) | ///
							(strpos(skill`j', "2d") > 0) | ///
							(strpos(skill`j', "3d") > 0) 
	gen accounting`j' = (strpos(skill`j', "accounting ") > 0) | ///
						(strpos(skill`j', "accounts ") > 0)
	gen accountmanagement`j' = (strpos(skill`j', "accountmanagement ") > 0)
	gen activedirectory`j' = (strpos(skill`j', "activedir") > 0)
	gen advertising`j' = (strpos(skill`j', "advertising ") > 0)
	gen engineering`j' = (strpos(skill`j', "engineering ") > 0)
	gen algorithms`j' = (strpos(skill`j', "algorithm ") > 0)
	gen analysis`j' = (strpos(skill`j', "analytic") > 0)
	gen animation`j' = (strpos(skill`j', "animation") > 0)
	gen apache`j' = (strpos(skill`j', "apache") > 0)
	gen apex`j' = (strpos(skill`j', "apex") > 0)
	gen arcgis`j' = (strpos(skill`j', "arcgis") > 0)
	gen architecture`j' = (strpos(skill`j', "architectur") > 0)
	}

	* Save topskill file
	tempfile topskill
	save `topskill'
*------------------------------------------------------------------------------*



*------------------------------------------------------------------------------*
** 8. Import the candidate_wishlist_data file **
import delimited "$indata/candidate_wishlist_data.csv", clear 

	drop wishlist_id
	ren candidate_id sid

	* Bring dataset in correct wide format
	bys sid preferable_type : gen wish_ = value[1]
	bys sid preferable_type: replace wish_ = wish_[_n-1] + " , " + value if _n > 1
	bys sid preferable_type : replace wish_ = wish_[_N]
	drop value
	duplicates drop
	reshape wide wish_, i(sid) j(preferable_type) string

	* Define dummy variables for preferences
		
		* Preferred Company Size
		gen csize1_15 = (strpos(wish_CompanySize, "1_15") >0 )
		gen csize16_50 = (strpos(wish_CompanySize, "16_50") >0 )
		gen csize51_200 = (strpos(wish_CompanySize, "51_200") >0 ) 
		gen csize201_500 = (strpos(wish_CompanySize, "201_500") >0 ) 
		gen csize500_ = (strpos(wish_CompanySize, "501_1000") >0 ) // all obs. who list 501-1000 as preference also list 1500+
		
		* Preferred Career Path
		gen cpath_ind =  (strpos(wish_CareerPath, "individual_contributor") >0 )  
		gen cpath_manager =  (strpos(wish_CareerPath, "manager") >0 )  
		
		* Preferred Career Goal
		gen cgoal_culture = (strpos(wish_CareerGoal, "great_culture") >0 )  
		gen cgoal_mentor =  (strpos(wish_CareerGoal, "mentorship") >0 )   
		gen cgoal_tech =  (strpos(wish_CareerGoal, "new_technologies") >0 )  
		gen cgoal_lead =  (strpos(wish_CareerGoal, "leadership_opportunities") >0 ) 
		gen cgoal_lsoc =  (strpos(wish_CareerGoal, "socially_conscious") >0 )   
		gen cgoal_large = (strpos(wish_CareerGoal, "large_projects") >0 )  
		
		* Preferred Industry
		gen wishind_ag  = (strpos(wish_Industry, "agriculture_farming_forestry") >0 )  
		gen wishind_analytics  = (strpos(wish_Industry, "analytics_business_information") >0 )  
		gen wishind_auto  = (strpos(wish_Industry, "automotive") >0 )  
		gen wishind_av = (strpos(wish_Industry, "aviation_space") >0 )  
		gen wishind_bank = (strpos(wish_Industry, "banking_corporate_finance_investing") >0 )  
		gen wishind_biotech = (strpos(wish_Industry, "biotechnology_chemical_products") >0 )  
		gen wishind_game = (strpos(wish_Industry, "casinos_gaming") >0 )  
		gen wishind_cleantech = (strpos(wish_Industry, "clean_tech") >0 )  
		gen wishind_clothes = (strpos(wish_Industry, "clothing_fashion_textile") >0 )  
		gen wishind_cybersecurity = (strpos(wish_Industry, "cybersecurity") >0 )  
		gen wishind_dating = (strpos(wish_Industry, "dating_relationships") >0 )  
		gen wishind_com = (strpos(wish_Industry, "digital_communication") >0 )  
		gen wishind_pay = (strpos(wish_Industry, "digital_payments") >0 )  
		gen wishind_storage = (strpos(wish_Industry, "digital_storage") >0 )  
		gen wishind_ecommerce = (strpos(wish_Industry, "ecommerce") >0 )  
		gen wishind_education = (strpos(wish_Industry, "education") >0 ) 
		gen wishind_energy = (strpos(wish_Industry, "electric_energy_natural_gas") >0 )  
		gen wishind_entpsoft = (strpos(wish_Industry, "enterprise_software") >0 )  
		gen wishind_food = (strpos(wish_Industry, "food_drink") >0 )  
		gen wishind_gov = (strpos(wish_Industry, "government_public_administration") >0 )  
		gen wishind_hardware = (strpos(wish_Industry, "hardware_internet_of_things_electronics") >0 )  
		gen wishind_health = (strpos(wish_Industry, "health_care_technology_nursing") >0 )  
		gen wishind_hotel = (strpos(wish_Industry, "hotels_restaurants_leisure_") >0 )  
		gen wishind_hr = (strpos(wish_Industry, "human_resources_careers") >0 )  
		gen wishind_supplychain = (strpos(wish_Industry, "industrial_automation_supply_chain") >0 )  
		gen wishind_infosys = (strpos(wish_Industry, "information_systems") >0 )  
		gen wishind_insurance = (strpos(wish_Industry, "insurance") >0 )  
		gen wishind_legal = (strpos(wish_Industry, "legal_services") >0 )  
		gen wishind_news = (strpos(wish_Industry, "news_media_advertising_publishing") >0 )  
		gen wishind_nonprofit = (strpos(wish_Industry, "non_profit") >0 )  
		gen wishind_oilgas = (strpos(wish_Industry, "oil_gas") >0 )  
		gen wishind_wellness = (strpos(wish_Industry, "personal_fitness_wellness") >0 ) 
		gen wishind_perssafe = (strpos(wish_Industry, "personal_security_safety") >0 )  
		gen wishind_pubsafe = (strpos(wish_Industry, "public_safety") >0 )  
		gen wishind_realestate = (strpos(wish_Industry, "real_estate_property_management") >0 )  
		gen wishind_research = (strpos(wish_Industry, "research_management_consulting") >0 ) 
		gen wishind_retail = (strpos(wish_Industry, "retail_convenience_stores") >0 )  
		gen wishind_robotics = (strpos(wish_Industry, "robotics") >0 )  
		gen wishind_socialnet = (strpos(wish_Industry, "social_networking") >0 )  
		gen wishind_sports = (strpos(wish_Industry, "sports") >0 )  
		gen wishind_infra = (strpos(wish_Industry, "agriculture_farming_forestry") >0 )  
		gen wishind_logistics = (strpos(wish_Industry, "transportation_logistics") >0 )  
		gen wishind_art = (strpos(wish_Industry, "tv_music_film_theater") >0 )  
		gen wishind_video = (strpos(wish_Industry, "video") >0 )  

		* Preferred Skills
		gen wishskill_python = (strpos(wish_NormalizedSkill, "Python") >0 )
		gen wishskill_java = (strpos(wish_NormalizedSkill, "Java ") >0 )
		gen wishskill_javascript = (strpos(wish_NormalizedSkill, "JavaScript") >0 )
		replace wishskill_javascript = (strpos(wish_NormalizedSkill, "Javascript") >0 ) if wishskill_javascript==0
		gen wishskill_productmanagement = (strpos(wish_NormalizedSkill, "Product Management") >0 )
		gen wishskill_react = (strpos(wish_NormalizedSkill, "React") >0 )
		gen wishskill_html = (strpos(wish_NormalizedSkill, "HTML") >0 )
		gen wishskill_ml = (strpos(wish_NormalizedSkill, "Machine Learning") >0 )
		gen wishskill_sql = (strpos(wish_NormalizedSkill, "SQL") >0 )
		gen wishskill_cplus = (strpos(wish_NormalizedSkill, "C++") >0 )
		gen wishskill_dataanalysis = (strpos(wish_NormalizedSkill, "Data Analysis") >0 )
		gen wishskill_leadership = (strpos(wish_NormalizedSkill, "Leadership") >0 )
		gen wishskill_css = (strpos(wish_NormalizedSkill, "CSS") >0 )
		gen wishskill_node = (strpos(wish_NormalizedSkill, "Node.JS") >0 )
		gen wishskill_productdevelopment = (strpos(wish_NormalizedSkill, "Product Development") >0 )
		gen wishskill_csharp = (strpos(wish_NormalizedSkill, "C#") >0 )
		gen wishskill_go = (strpos(wish_NormalizedSkill, "Go") >0 )
		gen wishskill_adobesuite = (strpos(wish_NormalizedSkill, "Adobe Creative Suite") >0 )
		gen wishskill_aws = (strpos(wish_NormalizedSkill, "AWS") >0 )
		gen wishskill_management = (strpos(wish_NormalizedSkill, "Management") >0 )
		gen wishskill_design = (strpos(wish_NormalizedSkill, "Design") >0 )
		gen wishskill_r = (strpos(wish_NormalizedSkill, "R ") >0 )
		gen wishskill_busdev = (strpos(wish_NormalizedSkill, "Business Development") >0 )
		gen wishskill_typescript = (strpos(wish_NormalizedSkill, "TypeScript") >0 )
		gen wishskill_swift = (strpos(wish_NormalizedSkill, "Swift") >0 )
		gen wishskill_php = (strpos(wish_NormalizedSkill, "PHP") >0 )
		gen wishskill_ruby = (strpos(wish_NormalizedSkill, "Ruby") >0 )
		gen wishskill_office = (strpos(wish_NormalizedSkill, "Microsoft") >0 )
		gen wishskill_kubernetes = (strpos(wish_NormalizedSkill, "Kubernetes") >0 )
		gen wishskill_c = (strpos(wish_NormalizedSkill, "C ") >0 )
		gen wishskill_ios = (strpos(wish_NormalizedSkill, "iOS") >0 )

	* Save preference file
	tempfile wish
	save `wish'
*------------------------------------------------------------------------------*




*------------------------------------------------------------------------------*
** 9. Import and clean the candidate_offers file **
* This file contains contains details on every single bid firms sent for every job (so one line is at the job × candidate level)

import delimited "$indata/candidate_offers.csv", clear 


* the candidate_offers file by default saves the most recent ask that the candidate has inputed as their ask
* but this may not be the one that the company saw when they sent the interview request
* We use the history of asks and their change date to fix the ask variable

	* Bring variables in correct format
	destring sid, replace
	
	* Format Ask salaries
	destring firstsalary0, replace force
		
		forvalues i=0(1)95 {
		destring ssalary`i', replace force
		}

		* Format the dates of the salaries
		forvalues i = 0(1)95 {
			gen double date_obs`i' = clock(ssalarychangedate`i', "YMDhms")
			drop ssalarychangedate`i'
			ren date_obs`i' ssalarychangedate`i'
			format ssalarychangedate`i' %tc
		}
		
	* Define ask salary at time of bid
	gen s_salary_alter = firstsalary0
	
		forvalues i=0(1)95 {
		replace s_salary_alter = ssalary`i' if d_sent_date > ssalarychangedate`i' & !mi(ssalarychangedate`i')
		}
		replace s_salary = s_salary_alter if !mi(s_salary_alter)
		drop s_salary_alter 
	
*------------------------------------------------------------------------------*


********************************************************************************
** 						 II/ Merging all files 								  **
********************************************************************************

	* 1. merge in equity data
	merge m:1 jobid sid using `compensation'
	keep if _merge !=2
	drop _merge

	* 2. merge in education data
	merge m:1  sid using `education'
	keep if _merge !=2
	drop _merge

	* 3. merge in CV companies data
	merge m:1  sid using `company'
	keep if _merge !=2
	drop _merge

	* 4. merge in benefits data
	merge m:1 companyid using `benefits'
	keep if _merge !=2
	drop _m
	
	* 5. merge in company characteristics data
	merge m:1  companyid using `platform'
	keep if _merge !=2
	drop _m
	
	* 6. merge in skills data
	merge m:1  sid using `topskill'
	keep if _merge !=2
	drop _m
		
	* 7. merge in the wishlist data
	merge m:1  sid using `wish'
	keep if _merge !=2
	drop _m

	* 8. merge in jobtitle data
	merge m:1 sid using `jobtitle'
	keep if _merge !=2
	drop _m



********************************************************************************
** 						 III/ Variable cleaning and generation 				  **			  
********************************************************************************

* The data cleaning and variable creation below  
* is mostly processing the raw "candidate_offers" data

**** time stamps ****

	format batch_start_date %tc
	format d_sent_date %tc


	* extract the date when the candidate's profile is created 
	gen syear = yofd(dofc(batch_start_date))
	gen smonth = month(dofc(batch_start_date))
	gen sweek = week(dofc(batch_start_date))
	gen smonthyear =  ym(syear, smonth) 
	gen sweekyear = yw(syear ,sweek)
	format sweekyear %tw
	format smonthyear %tm
	drop sweek

	* extract the dates of the offer sent
	gen dyear = yofd(dofc(d_sent_date))
	gen dmonth = month(dofc(d_sent_date))
	gen dweek = week(dofc(d_sent_date))
	gen dmonthyear =  ym(dyear, dmonth) 
	gen dweekyear = yw(dyear ,dweek)
	format dweekyear %tw
	format dmonthyear %tm
	drop dweek

	* number of days between profile creation and a given offer
	gen long sbatch_start_date = dofc(batch_start_date)
	format sbatch_start_date %td
	gen long sd_sent_date = dofc(d_sent_date)
	format sd_sent_date %td
	gen nbdays = sd_sent_date - sbatch_start_date
	drop  sd_sent_date
	tab nbdays

	* total length of the spell (in days)
	gen long sbatch_close_date = dofc(batch_close_date)
	format sbatch_close_date %td
	gen batch_length = sbatch_close_date - sbatch_start_date
	drop sbatch_close_date sbatch_start_date
	tab batch_length
	

**** log salaries *****

	gen logd_salary = log(d_salary)
	gen logdf_salary = log(df_salary)
	gen logs_salary = log(s_salary)
	
**** Hired recruitment process dummies ****

	*  dummy for receiving an IVR request
	gen d_sent = 0
	replace d_sent = 1 if !mi(d_sent_date)
	label value d_sent dummy
	tab d_sent, mi
 
	* dummy for acceptance of IVR request (conditional on receiving one)
	gen d_accept = .
	replace d_accept = 0 if d_sent == 1
	replace d_accept = 1 if !mi(d_accept_date)
	label value d_accept dummy
	tab d_accept, mi

	* dummy for sending a final job offer (conditional on interviewing)
	gen df_sent = .
	replace df_sent = 0  if d_accept == 1
	replace df_sent = 1  if !mi(df_sent_date) | offer_status == 2 
	label value df_sent dummy
	tab df_sent, mi

	* dummy for accepted final job offer (conditional on final offer sent)
	gen df_accept = .
	replace df_accept = 0 if !mi(df_sent_date)
	replace df_accept = 1 if !mi(df_salary) | offer_status == 2 /* "hired"*/
	label value df_accept dummy
	tab df_accept, mi

**** Dummies at the job level: **** 

	* 1) whether a job ends up being filled 
		bys jobid: egen df_max = max(df_salary) if !mi(jobid)
		gen offered = 1 if offer_status == 2 
		bys jobid: egen offered_max = max(offered) if !mi(jobid)
		* d_hire combines these 2 conditions at the jobid level
		gen d_hire = 0 if !mi(d_salary)
		replace d_hire = 1 if !mi(df_max) | !mi(offered_max)
		drop offered df_max offered_max
		tab d_hire, mi

	* 2) whether a job has made a final offer
		bys jobid: egen df_sent_date_max = max(df_sent_date) if !mi(jobid)
		gen d_offer = 0 if !mi(d_salary)
		replace d_offer = 1 if !mi(df_sent_date_max)
		drop df_sent_date_max
		tab d_offer, mi

	* 3) whether a candidate ends up being hired on Hired.com (same process at the sid level)
		bys sid: egen df_max = max(df_salary) if !mi(sid)
		gen offered = 1 if offer_status == 2 
		bys sid: egen offered_max = max(offered) if !mi(sid)
		gen s_hired = 0
		replace s_hired = 1 if !mi(df_max) | !mi(offered_max)
		drop df_max offered_max offered
		tab s_hired, mi

**** regression controls ****

	* number of previous spells
	bys sid (batch_start_date): gen nbpastbatch = 1 if batch_start_date[_n] !=  batch_start_date[_n-1]
	replace nbpastbatch = 0 if mi(nbpastbatch)
	bys sid (batch_start_date): replace nbpastbatch = sum(nbpastbatch)
	replace nbpastbatch = nbpastbatch -1
	replace nbpastbatch = 4 if nbpastbatch >4 & !mi(nbpastbatch) /* top code after 4 past spells*/
	tab nbpastbatch, mi

	* square of experience
	gen s_total_exp2 = s_total_exp^2

	* chronological position of the job offer in the candidate offer list
	sort batchid d_sent_date
	bys batchid: gen s_job_position = _n - 1
	replace s_job_position = 20 if s_job_position > 20 & !mi(s_job_position) /* top code after 20 offers*/

	* days_unemployed adjustment
	gen employed = 1 - unemployed
	replace days_unemployed = 0 if mi(days_unemployed) & employed == 1

	
**** Further variable recoding/cleaning ****

	* General assumption: when CV entry is missing, we assume that the candidate
	*					  doesn't have the corresponding skill

		* i. Preferences
		foreach var of varlist wishind_* csize* cpath_* cgoal_* wishskill_*{
			replace `var' = 0 if mi(`var')
		}

		* ii. FAANG dummy 
		replace faang = 0 if mi(faang)
		
		* iii. ivyplus degree 
		replace ivyplus = 0 if mi(ivyplus)
		
		* iv. skills
		local varlist blockchain java excel leadership projectmanagement html ///
		python javascript adobe product businessdevelopment sql socialmedia   ///
		enterprise dataanalysis pointnet c customerservice php ios android 	  ///
		chash marketing r saas design ruby nodejs machinelearning linux sales ///
		agile css react midmarket testing sketch strategy angular smb autocad ///
		swift communication leadgeneration businessanalysis testautomation 	  ///
		webdesign recruiting digitalstrategy go tableau matlab uidesign amazon ///
		salesforce hadoop asp sas devops perl vba scala financialanalysis 	  ///
		research statistics visualisation accounting accountmanagement 		  ///
		activedirectory advertising engineering algorithms analysis animation ///
		apache apex arcgis architecture 

		foreach var in `varlist' {
		replace `var' = 0 if mi(`var') & _merge == 1
		label values `var' dummy 
		}
		
		* v. Visa sponsorship
		replace s_sponsorship = 0 if mi(s_sponsorship)
		
		
* recode the s_choice_location to match the coding of the s_current_location
encode s_choice_location, generate(s_choice_location2)
drop s_choice_location
ren s_choice_location2 s_choice_location 
recode s_choice_location (15 = 1) (11=2) (9=3) (17=4) (3=5) (8=6) (2=7) (4=8) ///
						 (21=9) (1=10) (12=11) (13=11) (14=12) (20=13) (6=14) ///
						 (16=15) (19=16) (10=17) (18=18) (7=19) (5=20)
						 
						 
label define loc 1 "SF Bay Area" 2 "New York" 3 "Los Angeles" 4 "Seattle" ///
				 5 "Boston" 6 "London" 7 "Austin" 8 "Chicago" ///
				 9 "Washington D.C." 10 "Atlanta" 11 "Other" 12 "Philadelphia" ///
				 13 "Toronto" 14 "Denver" 15 "San Diego" 16 "Sydney" ///
				 17 "Melbourne" 18 "Singapore" 19 "France" 20 "Dallas / Ft Worth"

				 label values s_current_location loc
label values s_choice_location loc
tab s_choice_location

* gen aggregate experience groups
gen exp_group = 0 if s_primary_field_exp == 0 | s_primary_field_exp == 1
replace exp_group = 1 if s_primary_field_exp == 2 | s_primary_field_exp ==3 
replace exp_group = 2 if  s_primary_field_exp == 4 | s_primary_field_exp == 5 
label define exp_grouplab 0 "0-4" 1 "4-10" 2 "10+" 
label values exp_group exp_grouplab

label define gender 0 "Male" 1 "Female" 2 "unknown"
label value female gender 	


********************************************************************************
** 						V/ Variable  labeling  							 	  **
********************************************************************************

* candidate information
label var sid 					"candidate id"
label var batchid				"candidate X spell id"

label var s_gender 				"self-declared gender of the candidate"
label var s_gender_predict 		"predicted gender of the candidate using first name and sex-machine python library"
* note the female variable takes the self declared value if it exists, otherwise the predicted value
label var female 			    "=0 if candidate is male, 1 if female, 2 if unknown"
label var africanamerican 		"=1 if self-declared African American"
label var asian 				"=1 if self-declared Asian"
label var hispanic 				"=1 if self-declared Hispanic"
label var s_current_location	"current city of the candidate"
label var s_choice_location 	"desired city of the candidate"
label var s_primary_field 		"categorical occupation of the candidate (software engineering, marketing etc) "
label var s_primary_field_exp 	"experience in the occupation"
label var exp_group 			"aggregate experience in the occupation"
label var s_total_exp 			"total experience in years"
label var s_total_exp2 			"squared total experience in years"
label var s_experience			"(categorical) total experience (0-4, 4-6, etc)"
label var s_contract 			"contract preference (part-time / full-time) "
label var s_search_status 		"intensity of job search (just browsing vs actively looking)"
label var s_sponsorship 		"=1 if the candidate needs visa sponsorship"
label var s_nb_reports		    "categorical number of people who report to the candidate in their current job (0, 1-5, 5-10 etc)"
label var s_remote 				"preference for remote work (yes/no/only)"
label var days_unemployed 		"number of days in unemployment (=0 if employed)"
label var months_unemployed 	"number of months in unemployment (=0 if employed)"
label var unemployed 	 		"=1 if unemployed"
label var employed 	 		    "=1 if employed"
label var cat_degree 			"categorical variable of degree achieved (high school, ..., phd)"
label var csdegree 				"=1 if has a cs degree"
label var ivyplus				"=1 if ever went to an ivy+ school"
label var faang 				"=1 if candidate ever worked at a FAANG (Facebook, Amazon etc)"
label var s_grad_year 			"year the candidate graduated"
label var avg_educ_qual			"average ranking of all attended universities"
label var ranking				"categorical avg ranking of all attended universities"
label var s_salary 				"ask salary"
label var logs_salary 			"log ask salary"
label var highest_jobtitle		"highest job title of all listed jobs on CV"
label var linkedin				"=1 if LinkedIn profile is included on Hired"
label var website				"=1 if link to pers. website is included on Hired"

* spell information
label var companyid				"company id"
label var employerid			"recruiter within a company id"
label var jobid					"job id"

label var batch_start_date 	    "start of spell on the platform (date)"
label var batch_close_date 	    "end of spell on the platform (date)"
label var d_sent_date			"date the bid is sent"
label var d_accept_date 		"date the bid is accepted"
label var df_sent_date			"date the final offer is sent"

label var d_salary 				"bid salary"
label var logd_salary 			"log bid salary"
label var equity 				"=1 if bid contains equity"
label var d_location			"city of the job"
label var df_salary 			"(final) offer salary"

label var offer_status 			"stage of the recruitment process"
label var df_salary				"final offer salary"
label var logdf_salary			"log final offer salary"
label var s_rejection 			"reason for candidate's bid/offer rejection"
label var d_rejection 			"reason for company's candidare rejection"

label var nbdays			    "number of days in the current spell before this bid"
label var batch_length 			"lenght (in days) of the current spell"
label var nbpastbatch 			"number of spells on the platform before the current one"

* constructed data
label var s_job_position 		"number of previous offers received by the candidate"
label var d_sent 				"=1 if for this job a bid was sent"
label var d_hire 				"=1 if this job has a final hire"
label var d_offer				"=1 if this job has a final offer"
label var d_accept				"=1 if this job has an accepted offer"
label var df_sent				"=1 if this job has sent a final offer"
label var df_accept				"=1 if this job has an accepted offer"
label var s_hired 				"=1 if this candidate has a final offer"

* ranking info
label var company_name			"Standardised most recent company name"
label var platform_founded 		"Year the company on Hired was founded"
label var platform_worklifebalance "Indeed rating: work-life balance"
label var platform_paybenefits	"Indeed rating: pay & benefits"
label var platform_jobsecurity "Indeed rating: job security"
label var platform_management	"Indeed rating: management"
label var platform_culture 		"Indeed rating: culture"

* reform information
label var after				    "\textbf{After}"
* female after categorical variable
label var female_after 			"\textbf{Female $\times$ After}"

********************************************************************************
** 					IV/ Sample restriction on salary range 					  **
********************************************************************************

* the reason I drop the data below the 30K cutoff is to proxy for limiting the sample 
* to full time job search 
* I remove observations where the ask and bid are above 1million$, they are most likely
* entry error

count
keep if  (s_salary  > 30000 & s_salary < 999999) 
keep if  (d_salary  > 30000 & d_salary < 999999) | mi(d_salary)
keep if  (df_salary  > 30000 & df_salary < 999999) | mi(df_salary)
count
* Note that this drops only about 1.6% of the data  

sort sid  batch_start_date d_sent_date

save "$data/SGenderMetaD.dta", replace
